library(phyloseq) # for phyloseq object
library(ggplot2)
library(cowplot)
library(plyr)
library(dplyr)
library("plotly") # plot 3D
library("microbiome") # for centered log-ratio
library("coda") # Aitchison distance
library("coda.base") # Aitchison distance
library("vegan") # NMDS
library(pheatmap) # for heatmap
# Set path
path <- "~/Projects/IBS_Meta-analysis_16S"
path.plots <- file.path(path, "data/analysis-individual/PLOTS/plots-Ringel-EDA")
# Import phyloseq object
physeq.ringel <- readRDS(file.path(path, "phyloseq-objects/physeq_ringel.rds"))
# Sanity check
physeq.ringel
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 3261 taxa and 75 samples ]
## sample_data() Sample Data: [ 75 samples by 8 sample variables ]
## tax_table() Taxonomy Table: [ 3261 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 3261 tips and 3259 internal nodes ]
## refseq() DNAStringSet: [ 3261 reference sequences ]
Phylogenetic tree was computed with the package phangorn, and the script was run on a cluster. Let’s check we have correctly generated a phylogenetic tree.
# Look at the tree
plot_tree(physeq.ringel, color = "Phylum", ladderize="left")
# Plot Phylum
plot_bar(physeq.ringel, fill = "Phylum") +
theme(axis.text.x = element_blank())+
labs(x = "Samples", y = "Absolute abundance", title = "Ringel-Kulka dataset")
# ggsave(file.path(path.plots, "absAbundance_phylum.jpg"), width=13, height=5)
Sequencing depth characteristics of the Zhuang dataset:
- minimum of 2319 total count per sample
- median: 7298 total count per sample
- maximum of 1.411110^{4} total count per sample
# Agglomerate to phylum & class levels
phylum.table <- physeq.ringel %>%
tax_glom(taxrank = "Phylum") %>% # agglomerate at phylum level
transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
psmelt() # Melt to long format
class.table <- physeq.ringel %>%
tax_glom(taxrank = "Class") %>%
transform_sample_counts(function(x) {x/sum(x)} ) %>%
psmelt()
# Plot relative abundances
ggplot(phylum.table, aes(x = reorder(Sample, Sample, function(x) mean(phylum.table[Sample == x & Phylum == 'Firmicutes', 'Abundance'])),
y = Abundance, fill = Phylum))+
geom_bar(stat = "identity") +
theme(axis.text.x = element_blank())+
labs(x = "Samples", y = "Relative abundance", title = "Ringel-Kulka dataset")
# ggsave(file.path(path.plots, "relAbundance_phylum.jpg"), width=10, height=5)
ggplot(class.table, aes(x = reorder(Sample, Sample, function(x) mean(phylum.table[Sample == x & Phylum == 'Firmicutes', 'Abundance'])),
y = Abundance, fill = Class))+
geom_bar(stat = "identity") +
theme(axis.text.x = element_blank())+
labs(x = "Samples", y = "Relative abundance", title = "Ringel-Kulka dataset")
# ggsave(file.path(path.plots, "relAbundance_class.jpg"), width=12, height=5)
# Sanity check no sample with less than 500 total count
table(sample_sums(physeq.ringel)<500) # all FALSE
#____________________________________________________________________
# PHYLOSEQ OBJECT WITH NON-ZERO COMPOSITIONS
physeq.NZcomp <- physeq.ringel
otu_table(physeq.NZcomp)[otu_table(physeq.NZcomp) == 0] <- 0.5 # pseudocounts
# Sanity check that 0 values have been replaced
# otu_table(physeq.ringel)[1:5,1:5]
# otu_table(physeq.NZcomp)[1:5,1:5]
# transform into compositions
physeq.NZcomp <- transform_sample_counts(physeq.NZcomp, function(x) x / sum(x) )
table(rowSums(otu_table(physeq.NZcomp))) # check if there is any row not summing to 1
# Save object
saveRDS(physeq.NZcomp, file.path(path, "data/analysis-individual/Ringel-2015/03_EDA-Ringel/physeq_NZcomp.rds"))
#____________________________________________________________________
# PHYLOSEQ OBJECT WITH RELATIVE COUNT (BETWEEN 0 AND 1)
physeq.rel <- physeq.ringel
physeq.rel <- transform_sample_counts(physeq.rel, function(x) x / sum(x) ) # divide each count by the total number of counts (per sample)
# check the counts are all relative
# otu_table(physeq.ringel)[1:5, 1:5]
# otu_table(physeq.rel)[1:5, 1:5]
# sanity check
table(rowSums(otu_table(physeq.rel))) # check if there is any row not summing to 1
# save the physeq.rel object
saveRDS(physeq.rel, file.path(path, "data/analysis-individual/Ringel-2015/03_EDA-Ringel/physeq_relative.rds"))
#____________________________________________________________________
# PHYLOSEQ OBJECT WITH COMMON-SCALE NORMALIZATION
physeq.CSN <- physeq.ringel
physeq.CSN <- transform_sample_counts(physeq.CSN, function(x) (x*min(sample_sums(physeq.CSN))) / sum(x) )
# sanity check
table(rowSums(otu_table(physeq.CSN))) # check that all rows are summing to the same total
# save the physeq.CSN object
saveRDS(physeq.CSN, file.path(path, "data/analysis-individual/Ringel-2015/03_EDA-Ringel/physeq_CSN.rds"))
#____________________________________________________________________
# PHYLOSEQ OBJECT WITH CENTERED LOG RATIO COUNT
physeq.clr <- physeq.ringel
physeq.clr <- microbiome::transform(physeq.ringel, "clr") # the function adds pseudocounts itself
# Compare the otu tables in the original phyloseq object and the new one after CLR transformation
# otu_table(physeq.ringel)[1:5, 1:5] # should contain absolute counts
# otu_table(physeq.clr)[1:5, 1:5] # should all be relative
# save the physeq.rel object
saveRDS(physeq.clr, file.path(path, "data/analysis-individual/Ringel-2015/03_EDA-Ringel/physeq_clr.rds"))
We will be looking at four distances of interest: UniFrac, Aitchison, Bray-Curtis and Canberra.
#____________________________________________________________________________________
# Measure distances
getDistances <- function(){
set.seed(123) # for unifrac, need to set a seed
glom.UniF <- UniFrac(physeq.rel, weighted=TRUE, normalized=TRUE) # weighted unifrac
glom.ait <- phyloseq::distance(physeq.clr, method = 'euclidean') # aitchison
glom.bray <- phyloseq::distance(physeq.CSN, method = "bray") # bray-curtis
glom.can <- phyloseq::distance(physeq.NZcomp, method = "canberra") # canberra
dist.list <- list("UniF" = glom.UniF, "Ait" = glom.ait, "Canb" = glom.can, "Bray" = glom.bray)
return(dist.list)
}
#____________________________________________________________________________________
# Plot in 2D the distances
plotDistances2D <- function(dlist, ordination="MDS"){
plist <- NULL
plist <- vector("list", 4)
names(plist) <- c("Weighted Unifrac", "Aitchison", "Bray-Curtis", "Canberra")
print("Unifrac")
# Weighted UniFrac
set.seed(123)
iMDS.UniF <- ordinate(physeq.rel, ordination, distance=dlist$UniF)
plist[[1]] <- plot_ordination(physeq.rel, iMDS.UniF)
print("Aitchison")
# Aitchison
set.seed(123)
iMDS.Ait <- ordinate(physeq.clr, ordination, distance=dlist$Ait)
plist[[2]] <- plot_ordination(physeq.clr, iMDS.Ait)
print("Bray")
# Bray-Curtis
set.seed(123)
iMDS.Bray <- ordinate(physeq.CSN, ordination, distance=dlist$Bray)
plist[[3]] <- plot_ordination(physeq.CSN, iMDS.Bray)
print("Canberra")
# Canberra
set.seed(123)
iMDS.Can <- ordinate(physeq.NZcomp, ordination, distance=dlist$Can)
plist[[4]] <- plot_ordination(physeq.NZcomp, iMDS.Can)
# Creating a dataframe to plot everything
plot.df = ldply(plist, function(x) x$data)
names(plot.df)[1] <- "distance"
return(plot.df)
}
Now let’s plot!
# Get the distances & the plot data
dist.ringel <- getDistances()
plot.df <- plotDistances2D(dist.ringel)
## [1] "Unifrac"
## [1] "Aitchison"
## [1] "Bray"
## [1] "Canberra"
# Plot
ggplot(plot.df, aes(Axis.1, Axis.2))+
geom_point(size=4, alpha=0.5) +
facet_wrap(distance~., scales='free', nrow=1)+
theme_bw()+
theme(strip.text.x = element_text(size=20))
# ggsave(file.path(path.plots, "distances4_MDS.jpg"), height = 4, width = 15)
# Function to get heatmap from the distances computed
plotHeatmaps <- function(dlist, fontsize){
# Initialize variables
i=1
plist <- vector("list", 4)
names(plist) <- names(dlist)
# Loop through distances
for(d in dlist){
plist[[i]] <- pheatmap(as.matrix(d),
clustering_distance_rows = d,
clustering_distance_cols = d,
fontsize = fontsize,
fontsize_col = fontsize-5,
fontsize_row = fontsize-5,
# annotation_col = matcol,
# annotation_row = matcol,
# annotation_colors = list(host_disease = c('Healthy' = 'blue', 'IBS' = 'red')),
cluster_rows = T,
cluster_cols = T,
clustering_method = 'complete', # hc method
main = names(dlist)[i]) # have name of distance as title
i <- i+1
}
return(plist)
}
# Get the heatmaps
heatmp.ringel <- plotHeatmaps(dlist = dist.ringel, fontsize = 8)